/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../include/userdict.h"
#include "../include/splparser.h"
#include "../include/ngram.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifdef ___DEBUG_PERF___
#include <cutils/log.h>
#endif
 //#*#* #include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <assert.h>
#include <ctype.h>
#include <sys/types.h>

//#*#*{{
// Windows compatibility shims: a CRITICAL_SECTION-based stand-in for the
// pthread mutex API and a gettimeofday() implementation, both missing from
// the Win32 CRT.
// NOTE: guarded on _WIN32 (always defined by Windows compilers) so the
// definitions here are visible wherever the `#ifdef _WIN32` test below
// (g_mutex_ and the pthread_mutex_* macros) is taken; the plain WIN32 macro
// is not guaranteed to be defined and previously could leave CCriticalSection
// undeclared while still being referenced.
#if defined(_WIN32) || defined(WIN32)
#   include <windows.h>
#   include <io.h>
#   include <process.h>
#   include <synchapi.h>

// Minimal wrapper around a Win32 CRITICAL_SECTION exposing the subset of
// mutex operations used by this file. Construction/destruction manage the
// underlying kernel object (RAII).
struct CCriticalSection
{
	CCriticalSection() { InitializeCriticalSection(&m_cs); }
	~CCriticalSection() { DeleteCriticalSection(&m_cs); }
	void lock() { EnterCriticalSection(&m_cs); }
	void unlock() { LeaveCriticalSection(&m_cs); }
	// Returns true when the lock was acquired without blocking.
	bool trylock() { return TryEnterCriticalSection(&m_cs) != 0; }
private:
	CRITICAL_SECTION m_cs;
};

// POSIX gettimeofday() emulation built on GetLocalTime()/mktime().
// The timezone argument is ignored (as it is on modern POSIX too).
int gettimeofday(struct timeval *tp, void *tzp)
{
	time_t clock;
	struct tm tm;
	SYSTEMTIME wtm;
	GetLocalTime(&wtm);
	tm.tm_year = wtm.wYear - 1900;
	tm.tm_mon = wtm.wMonth - 1;
	tm.tm_mday = wtm.wDay;
	tm.tm_hour = wtm.wHour;
	tm.tm_min = wtm.wMinute;
	tm.tm_sec = wtm.wSecond;
	tm.tm_isdst = -1;  // let mktime() decide whether DST applies
	clock = mktime(&tm);
	tp->tv_sec = clock;
	tp->tv_usec = wtm.wMilliseconds * 1000;
	return (0);
}

#else
#   include <sys/time.h>
#   include <unistd.h>
#   include <pthread.h>
#endif
//#*#* #include <sys/time.h>
//#*#*}}
#include <time.h>
#include <math.h>

namespace ime_pinyin {

#ifdef ___DEBUG_PERF___
	// Scratch state shared by the DEBUG_PERF_BEGIN/END timing macros.
	static uint64 _ellapse_ = 0;
	static struct timeval _tv_start_, _tv_end_;
// Open a wall-clock measurement window.
#define DEBUG_PERF_BEGIN \
    do { \
      gettimeofday(&_tv_start_, NULL); \
    } while (0)
// Close the window; elapsed microseconds are stored in _ellapse_.
#define DEBUG_PERF_END \
    do { \
      gettimeofday(&_tv_end_, NULL); \
      _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
                  (_tv_end_.tv_usec - _tv_start_.tv_usec); \
    } while (0)
// Log the last measured duration tagged with `message`.
#define LOGD_PERF(message) \
    ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
#else
// Perf tracing compiled out: the macros expand to nothing.
#define DEBUG_PERF_BEGIN
#define DEBUG_PERF_END
#define LOGD_PERF(message)
#endif

	// XXX File load and write are thread-safe by g_mutex_
#ifdef _WIN32
	// Map the pthread mutex calls used below onto CCriticalSection.
	// pthread_mutex_trylock() returns 0 on success, hence the negation.
	static CCriticalSection g_mutex_;
#define pthread_mutex_lock(MUTEX) ((MUTEX)->lock())
#define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock())
#define pthread_mutex_trylock(MUTEX) (!(MUTEX)->trylock())
#else
	static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
#endif
	// Timestamp of the most recent write-back to disk (shared across
	// UserDict instances in this process; compared against load_time_).
	static struct timeval g_last_update_ = { 0, 0 };

	// Expected on-disk size of the dictionary described by `info`:
	// 4-byte version word + lemma pool + offset and score arrays
	// (4 bytes per lemma each, hence lemma_count << 3), the optional
	// predict/sync arrays, and the trailing UserDictInfo footer.
	inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
		return (4 + info->lemma_size + (info->lemma_count << 3)
#ifdef ___PREDICT_ENABLED___
			+ (info->lemma_count << 2)
#endif
#ifdef ___SYNC_ENABLED___
			+ (info->sync_count << 2)
#endif
			+ sizeof(*info));
	}

	// Converts a raw stored score (low 16 bits: user frequency; high bits:
	// last-modified-time index) into the log-scaled LmaScoreType used for
	// ranking. Recently used lemmas receive a larger expansion factor.
	inline LmaScoreType UserDict::translate_score(int raw_score) {
		// 1) ori_freq: original user frequency
		uint32 ori_freq = extract_score_freq(raw_score);
		// 2) lmt_off: lmt index (week offset for example)
		uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
		if (kUserDictLMTBitWidth < 16) {
			// Keep only the low kUserDictLMTBitWidth bits. The previous
			// mask ~(1 << kUserDictLMTBitWidth) cleared a single bit and
			// left the stale higher bits in place.
			uint64 mask = ((uint64)1 << kUserDictLMTBitWidth) - 1;
			lmt_off &= mask;
		}
		// 3) now_off: current time index (current week offset for example)
		// assuming load_time_ is around current time
		uint64 now_off = load_time_.tv_sec;
		now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
		now_off = (now_off << (64 - kUserDictLMTBitWidth));
		now_off = (now_off >> (64 - kUserDictLMTBitWidth));
		// 4) factor: decide expand-factor; ages in steps of 16 per time
		// unit, clamped at 4 units (factor range [16, 80]).
		int delta = now_off - lmt_off;
		if (delta > 4)
			delta = 4;
		int factor = 80 - (delta << 4);

		double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
		return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
			* NGram::kLogValueAmplifier);
	}

	// The user frequency occupies the low 16 bits of a raw score word.
	inline int UserDict::extract_score_freq(int raw_score) {
		return raw_score & 0x0000ffff;
	}

	// Extracts the last-modified time from a raw score: the high 16 bits
	// hold a time index, converted back to seconds via granularity/epoch.
	inline uint64 UserDict::extract_score_lmt(int raw_score) {
		uint64 lmt = ((raw_score & 0xffff0000) >> 16);
		if (kUserDictLMTBitWidth < 16) {
			// Keep only the low kUserDictLMTBitWidth bits; the previous
			// mask ~(1 << kUserDictLMTBitWidth) cleared just one bit.
			uint64 mask = ((uint64)1 << kUserDictLMTBitWidth) - 1;
			lmt &= mask;
		}
		lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
		return lmt;
	}

	// Packs a last-modified timestamp (as a kUserDictLMTBitWidth-bit time
	// index, high 16 bits) and a frequency (low 16 bits) into one raw score.
	inline int UserDict::build_score(uint64 lmt, int freq) {
		lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
		// Truncate the time index to kUserDictLMTBitWidth bits.
		lmt = (lmt << (64 - kUserDictLMTBitWidth));
		lmt = (lmt >> (64 - kUserDictLMTBitWidth));
		const uint16 lmt16 = (uint16)lmt;
		return ((int)lmt16 << 16) | (freq & 0x0000ffff);
	}

	// Parses an optionally signed decimal integer from a UTF-16LE buffer of
	// at most `len` code units. Parsing stops at the first non-digit;
	// returns 0 for empty or non-numeric input.
	inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
		int64 ret = 0;
		if (len <= 0)
			return ret;

		int flag = 1;
		const uint16 * endp = s + len;
		if (*s == '-') {
			flag = -1;
			s++;
		}
		else if (*s == '+') {
			s++;
		}

		// Bound check BEFORE dereferencing: a lone "-"/"+" would otherwise
		// read one unit past the buffer. Accumulate as ret * 10 + digit
		// (the previous "ret +=" form computed 11 * ret + digit).
		while (s < endp && *s >= '0' && *s <= '9') {
			ret = ret * 10 + (*s) - '0';
			s++;
		}
		return ret * flag;
	}

	// Formats `v` as decimal UTF-16LE text into `s` (capacity `size` code
	// units, not NUL-terminated). Returns the number of units written, or 0
	// when the buffer is too small. Note: v == 0 emits no digits (returns
	// 0), matching the original contract.
	inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
		if (!s || size <= 0)
			return 0;
		uint16 *endp = s + size;
		int ret_len = 0;
		if (v < 0) {
			*(s++) = '-';
			++ret_len;
			v *= -1;
		}

		uint16 *b = s;
		while (s < endp && v != 0) {
			*(s++) = '0' + (v % 10);
			v = v / 10;
			++ret_len;
		}

		// Ran out of room before the value was fully emitted.
		if (v != 0)
			return 0;

		--s;

		// Digits were produced least-significant first; reverse them in
		// place. (The previous loop only copied *s into *b without
		// swapping, corrupting the front half of the output.)
		while (b < s) {
			uint16 tmp = *b;
			*b = *s;
			*s = tmp;
			++b, --s;
		}

		return ret_len;
	}

	// ORs `flag` into the flag byte (byte 0) of the lemma stored at
	// `offset` in the lemma pool. High control bits of offset are masked.
	inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
		offset &= kUserDictOffsetMask;
		lemmas_[offset] |= flag;
	}

	// Returns the flag byte (byte 0) of the lemma at `offset`.
	inline char UserDict::get_lemma_flag(uint32 offset) {
		offset &= kUserDictOffsetMask;
		return (char)(lemmas_[offset]);
	}

	// Returns the character count (byte 1) of the lemma at `offset`.
	inline char UserDict::get_lemma_nchar(uint32 offset) {
		offset &= kUserDictOffsetMask;
		return (char)(lemmas_[offset + 1]);
	}

	// Returns the spelling-id array of the lemma at `offset`: nchar uint16
	// values starting right after the 2-byte flag/nchar header.
	inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
		offset &= kUserDictOffsetMask;
		return (uint16 *)(lemmas_ + offset + 2);
	}

	// Returns the hanzi string of the lemma at `offset`: nchar uint16
	// values stored after the header and the spelling-id array.
	inline uint16 * UserDict::get_lemma_word(uint32 offset) {
		offset &= kUserDictOffsetMask;
		uint8 nchar = get_lemma_nchar(offset);
		return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
	}

	// Largest lemma id currently assigned.
	inline LemmaIdType UserDict::get_max_lemma_id() {
		// When a lemma is deleted, we do not claim its id back for
		// simplicity and performance
		return start_id_ + dict_info_.lemma_count - 1;
	}

	// A lemma id is valid iff it lies within [start_id_, max lemma id].
	inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
		return (id >= start_id_) && (id <= get_max_lemma_id());
	}

	// The dictionary is usable in every state except USER_DICT_NONE.
	inline bool UserDict::is_valid_state() {
		return state_ != USER_DICT_NONE;
	}

	// Constructs an empty, unloaded dictionary (state USER_DICT_NONE);
	// load_dict() must be called before use.
	UserDict::UserDict()
		: start_id_(0),
		version_(0),
		lemmas_(NULL),
		offsets_(NULL),
		scores_(NULL),
		ids_(NULL),
#ifdef ___PREDICT_ENABLED___
		predicts_(NULL),
#endif
#ifdef ___SYNC_ENABLED___
		syncs_(NULL),
		sync_count_size_(0),
#endif
		offsets_by_id_(NULL),
		lemma_count_left_(0),
		lemma_size_left_(0),
		dict_file_(NULL),
		state_(USER_DICT_NONE) {
		memset(&dict_info_, 0, sizeof(dict_info_));
		memset(&load_time_, 0, sizeof(load_time_));
#ifdef ___CACHE_ENABLED___
		cache_init();
#endif
	}

	// Flushes pending changes (if dirty) and releases all buffers.
	UserDict::~UserDict() {
		close_dict();
	}

	// Opens the user dictionary at `file_name`, creating/resetting it when
	// validation fails, and loads it into memory. Lemma ids are assigned
	// from `start_id`. Returns false (with members rolled back) on failure.
	// NOTE(review): `end_id` is accepted but not enforced here.
	bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
		LemmaIdType end_id) {
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_BEGIN;
#endif
		// Keep a private copy of the path for later flush/reload.
		dict_file_ = strdup(file_name);
		if (!dict_file_)
			return false;

		start_id_ = start_id;

		// An invalid/corrupt file is recreated empty via reset().
		if (false == validate(file_name) && false == reset(file_name)) {
			goto error;
		}
		if (false == load(file_name, start_id)) {
			goto error;
		}

		state_ = USER_DICT_SYNC;

		// Remember when we loaded; compared against g_last_update_ to
		// detect writes by other instances.
		gettimeofday(&load_time_, NULL);

#ifdef ___DEBUG_PERF___
		DEBUG_PERF_END;
		LOGD_PERF("load_dict");
#endif
		return true;
	error:
		free((void*)dict_file_);
		dict_file_ = NULL;
		start_id_ = 0;
		return false;
	}

	// Writes dirty state back to disk (when we are the most recent loader)
	// and frees every in-memory buffer, returning to USER_DICT_NONE.
	bool UserDict::close_dict() {
		if (state_ == USER_DICT_NONE)
			return true;
		// Clean state: nothing to flush, just release memory.
		if (state_ == USER_DICT_SYNC)
			goto out;

		// If dictionary is written back by others,
		// we can not simply write back here
		// To do a safe flush, we have to discard all newly added
		// lemmas and try to reload dict file.
		pthread_mutex_lock(&g_mutex_);
		// Only write back if nobody has updated the file since we loaded.
		if (load_time_.tv_sec > g_last_update_.tv_sec ||
			(load_time_.tv_sec == g_last_update_.tv_sec &&
				load_time_.tv_usec > g_last_update_.tv_usec)) {
			write_back();
			gettimeofday(&g_last_update_, NULL);
		}
		pthread_mutex_unlock(&g_mutex_);

	out:
		free((void*)dict_file_);
		free(lemmas_);
		free(offsets_);
		free(offsets_by_id_);
		free(scores_);
		free(ids_);
#ifdef ___PREDICT_ENABLED___
		free(predicts_);
#endif

		version_ = 0;
		dict_file_ = NULL;
		lemmas_ = NULL;
#ifdef ___SYNC_ENABLED___
		// NOTE(review): syncs_ is nulled but not freed here — verify the
		// sync buffer is released elsewhere or this leaks on close.
		syncs_ = NULL;
		sync_count_size_ = 0;
#endif
		offsets_ = NULL;
		offsets_by_id_ = NULL;
		scores_ = NULL;
		ids_ = NULL;
#ifdef ___PREDICT_ENABLED___
		predicts_ = NULL;
#endif

		memset(&dict_info_, 0, sizeof(dict_info_));
		lemma_count_left_ = 0;
		lemma_size_left_ = 0;
		state_ = USER_DICT_NONE;

		return true;
	}

	// Total lemma slots in use (removed lemmas are flagged, not reclaimed,
	// so they remain counted here; see free_count for the removed total).
	size_t UserDict::number_of_lemmas() {
		return dict_info_.lemma_count;
	}

	// The user dictionary keeps no milestone state; intentionally a no-op.
	void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
		return;
	}

	// Extends the search by one spelling-id step: collects matching lemmas
	// into lpi_items and returns a non-zero handle when either matches were
	// found or longer lemmas share this prefix (so extension can continue).
	MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
		const DictExtPara *dep,
		LmaPsbItem *lpi_items,
		size_t lpi_max, size_t *lpi_num) {
		if (is_valid_state() == false)
			return 0;

		bool need_extend = false;

#ifdef ___DEBUG_PERF___
		DEBUG_PERF_BEGIN;
#endif
		*lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
			lpi_items, lpi_max, &need_extend);
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_END;
		LOGD_PERF("extend_dict");
#endif
		return ((*lpi_num > 0 || need_extend) ? 1 : 0);
	}

	int UserDict::is_fuzzy_prefix_spell_id(
		const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
		if (len1 < searchable->splids_len)
			return 0;

		SpellingTrie &spl_trie = SpellingTrie::get_instance();
		uint32 i = 0;
		for (i = 0; i < searchable->splids_len; i++) {
			const char py1 = *spl_trie.get_spelling_str(id1[i]);
			uint16 off = 8 * (i % 4);
			const char py2 = ((searchable->signature[i / 4] & (0xff << off)) >> off);
			if (py1 == py2)
				continue;
			return 0;
		}
		return 1;
	}

	int UserDict::fuzzy_compare_spell_id(
		const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
		if (len1 < searchable->splids_len)
			return -1;
		if (len1 > searchable->splids_len)
			return 1;

		SpellingTrie &spl_trie = SpellingTrie::get_instance();
		uint32 i = 0;
		for (i = 0; i < len1; i++) {
			const char py1 = *spl_trie.get_spelling_str(id1[i]);
			uint16 off = 8 * (i % 4);
			const char py2 = ((searchable->signature[i / 4] & (0xff << off)) >> off);
			if (py1 == py2)
				continue;
			if (py1 > py2)
				return 1;
			return -1;
		}
		return 0;
	}

	bool UserDict::is_prefix_spell_id(
		const uint16 * fullids, uint16 fulllen,
		const UserDictSearchable *searchable) {
		if (fulllen < searchable->splids_len)
			return false;

		uint32 i = 0;
		for (; i < searchable->splids_len; i++) {
			uint16 start_id = searchable->splid_start[i];
			uint16 count = searchable->splid_count[i];
			if (fullids[i] >= start_id && fullids[i] < start_id + count)
				continue;
			else
				return false;
		}
		return true;
	}

	bool UserDict::equal_spell_id(
		const uint16 * fullids, uint16 fulllen,
		const UserDictSearchable *searchable) {
		if (fulllen != searchable->splids_len)
			return false;

		uint32 i = 0;
		for (; i < fulllen; i++) {
			uint16 start_id = searchable->splid_start[i];
			uint16 count = searchable->splid_count[i];
			if (fullids[i] >= start_id && fullids[i] < start_id + count)
				continue;
			else
				return false;
		}
		return true;
	}

	// Binary-searches offsets_ (sorted by spelling) and returns the index
	// of the first entry whose spellings are a fuzzy prefix of the search
	// key, or -1 when no entry matches.
	// (Removed the dead local `last_matched`, which was written but never
	// read, and merged the two identical cmp >= 0 branches.)
	int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
		int32 begin = 0;
		int32 end = dict_info_.lemma_count - 1;
		int32 middle = -1;

		int32 first_prefix = -1;

		while (begin <= end) {
			middle = (begin + end) >> 1;
			uint32 offset = offsets_[middle];
			uint8 nchar = get_lemma_nchar(offset);
			const uint16 * splids = get_lemma_spell_ids(offset);
			int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
			int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);

			if (pre)
				first_prefix = middle;

			if (cmp < 0) {
				begin = middle + 1;
			}
			else {
				// On match or overshoot, keep scanning left so the FIRST
				// fuzzy prefix wins.
				end = middle - 1;
			}
		}

		return first_prefix;
	}

	// Builds a UserDictSearchable from a spelling-id string: expands half
	// ids into their full-id ranges and packs each spelling's first letter
	// into the 4-letters-per-word signature used by the fuzzy comparators.
	void UserDict::prepare_locate(UserDictSearchable *searchable,
		const uint16 *splid_str,
		uint16 splid_str_len) {
		searchable->splids_len = splid_str_len;
		memset(searchable->signature, 0, sizeof(searchable->signature));

		SpellingTrie &spl_trie = SpellingTrie::get_instance();
		uint32 i = 0;
		for (; i < splid_str_len; i++) {
			if (spl_trie.is_half_id(splid_str[i])) {
				// Half id: match any full id in the expanded range.
				searchable->splid_count[i] =
					spl_trie.half_to_full(splid_str[i],
						&(searchable->splid_start[i]));
			}
			else {
				searchable->splid_count[i] = 1;
				searchable->splid_start[i] = splid_str[i];
			}
			// Pack the initial letter: 4 letters per 32-bit signature word.
			const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
			searchable->signature[i >> 2] |= (py << (8 * (i % 4)));
		}
	}

	// Public wrapper around _get_lpis without the prefix-extension output.
	size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
		LmaPsbItem *lpi_items, size_t lpi_max) {
		return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
	}

	// Collects all lemmas whose spellings exactly match splid_str into
	// lpi_items (up to lpi_max). *need_extend is set when longer lemmas
	// share this prefix, telling the caller that extending the input can
	// still match. Reloads from disk first if another instance flushed.
	size_t UserDict::_get_lpis(const uint16 *splid_str,
		uint16 splid_str_len, LmaPsbItem *lpi_items,
		size_t lpi_max, bool * need_extend) {
		// Allow callers that do not care about extension info.
		bool tmp_extend;
		if (!need_extend)
			need_extend = &tmp_extend;

		*need_extend = false;

		if (is_valid_state() == false)
			return 0;
		if (lpi_max <= 0)
			return 0;

		// Best-effort staleness check: if the lock is busy we just proceed
		// with the in-memory copy.
		if (0 == pthread_mutex_trylock(&g_mutex_)) {
			if (load_time_.tv_sec < g_last_update_.tv_sec ||
				(load_time_.tv_sec == g_last_update_.tv_sec &&
					load_time_.tv_usec < g_last_update_.tv_usec)) {
				// Others updated disk file, have to reload
				pthread_mutex_unlock(&g_mutex_);
				flush_cache();
			}
			else {
				pthread_mutex_unlock(&g_mutex_);
			}
		}
		else {
		}

		UserDictSearchable searchable;
		prepare_locate(&searchable, splid_str, splid_str_len);

		uint32 max_off = dict_info_.lemma_count;
#ifdef ___CACHE_ENABLED___
		// A cache hit narrows the scan window to [start, start + count).
		int32 middle;
		uint32 start, count;
		bool cached = cache_hit(&searchable, &start, &count);
		if (cached) {
			middle = start;
			max_off = start + count;
		}
		else {
			middle = locate_first_in_offsets(&searchable);
			start = middle;
		}
#else
		int32 middle = locate_first_in_offsets(&searchable);
#endif

		if (middle == -1) {
#ifdef ___CACHE_ENABLED___
			// Remember the miss so the binary search is skipped next time.
			if (!cached)
				cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
#endif
			return 0;
		}

		size_t lpi_current = 0;

		// Linear scan forward from the first fuzzy-prefix position until
		// entries stop matching even as a prefix.
		bool fuzzy_break = false;
		bool prefix_break = false;
		while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
			if (lpi_current >= lpi_max)
				break;
			uint32 offset = offsets_[middle];
			// Ignore deleted lemmas
			if (offset & kUserDictOffsetFlagRemove) {
				middle++;
				continue;
			}
			uint8 nchar = get_lemma_nchar(offset);
			uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
			if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#else
			if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#endif
				fuzzy_break = true;
			}

			if (prefix_break == false) {
				if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
					if (*need_extend == false &&
						is_prefix_spell_id(splids, nchar, &searchable)) {
						*need_extend = true;
					}
				}
				else {
					prefix_break = true;
				}
			}

			// Exact match: emit a candidate with its translated score.
			if (equal_spell_id(splids, nchar, &searchable) == true) {
				lpi_items[lpi_current].psb = translate_score(scores_[middle]);
				lpi_items[lpi_current].id = ids_[middle];
				lpi_items[lpi_current].lma_len = nchar;
				lpi_current++;
			}
			middle++;
		}

#ifdef ___CACHE_ENABLED___
		if (!cached) {
			count = middle - start;
			cache_push(USER_DICT_CACHE, &searchable, start, count);
		}
#endif

		return lpi_current;
	}

	// Copies the hanzi string of `id_lemma` into str_buf (capacity str_max
	// code units including the NUL terminator). Returns the number of
	// characters copied, 0 on any invalid argument.
	uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
		uint16 str_max) {
		if (is_valid_state() == false)
			return 0;
		if (is_valid_lemma_id(id_lemma) == false)
			return 0;
		// Guard a zero-capacity buffer: str_max - 1 below would otherwise
		// wrap around and overrun str_buf.
		if (str_buf == NULL || str_max == 0)
			return 0;
		uint32 offset = offsets_by_id_[id_lemma - start_id_];
		uint8 nchar = get_lemma_nchar(offset);
		char16 * str = get_lemma_word(offset);
		uint16 m = nchar < str_max - 1 ? nchar : str_max - 1;
		int i = 0;
		for (; i < m; i++) {
			str_buf[i] = str[i];
		}
		str_buf[i] = 0;
		return m;
	}

	// Copies the spelling ids of `id_lemma` into `splids` (at most
	// splids_max entries) and returns the number copied. `arg_valid` is
	// accepted for interface compatibility and not used here.
	uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
		uint16 splids_max, bool arg_valid) {
		if (is_valid_lemma_id(id_lemma) == false)
			return 0;
		const uint32 offset = offsets_by_id_[id_lemma - start_id_];
		const uint8 nchar = get_lemma_nchar(offset);
		const uint16 * ids = get_lemma_spell_ids(offset);
		int copied = 0;
		while (copied < nchar && copied < splids_max) {
			splids[copied] = ids[copied];
			++copied;
		}
		return copied;
	}

	// Fills npre_items with lemmas that begin with the history string
	// last_hzs (and are strictly longer), copying the remaining characters
	// as the prediction. Returns the number of items added. No-op unless
	// built with ___PREDICT_ENABLED___.
	size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
		NPredictItem *npre_items, size_t npre_max,
		size_t b4_used) {
		uint32 new_added = 0;
#ifdef ___PREDICT_ENABLED___
		int32 end = dict_info_.lemma_count - 1;
		int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
		if (j == -1)
			return 0;

		while (j <= end) {
			uint32 offset = predicts_[j];
			// Ignore deleted lemmas
			if (offset & kUserDictOffsetFlagRemove) {
				j++;
				continue;
			}
			uint32 nchar = get_lemma_nchar(offset);
			uint16 * words = get_lemma_word(offset);
			uint16 * splids = get_lemma_spell_ids(offset);

			// Predictions must be strictly longer than the history.
			if (nchar <= hzs_len) {
				j++;
				continue;
			}

			if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
				if (new_added >= npre_max) {
					return new_added;
				}
				// Bytes of the suffix, clamped to kMaxPredictSize chars.
				uint32 cpy_len =
					(nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
					- (hzs_len << 1);
				npre_items[new_added].his_len = hzs_len;
				npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
				memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
				if ((cpy_len >> 1) < kMaxPredictSize) {
					npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
				}
				new_added++;
			}
			else {
				// predicts_ is word-sorted, so the first mismatch ends the run.
				break;
			}

			j++;
		}
#endif
		return new_added;
	}

	// Finds the offsets_ index of the exact lemma (same spellings AND same
	// hanzi string), or -1 when absent. Deleted entries are skipped.
	int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
		uint16 lemma_len) {
		int32 max_off = dict_info_.lemma_count;

		UserDictSearchable searchable;
		prepare_locate(&searchable, splid_str, lemma_len);
#ifdef ___CACHE_ENABLED___
		// A cache hit narrows the scan window to [start, start + count).
		int32 off;
		uint32 start, count;
		bool cached = load_cache(&searchable, &start, &count);
		if (cached) {
			off = start;
			max_off = start + count;
		}
		else {
			off = locate_first_in_offsets(&searchable);
			start = off;
		}
#else
		int32 off = locate_first_in_offsets(&searchable);
#endif

		if (off == -1) {
			return off;
		}

		while (off < max_off) {
			uint32 offset = offsets_[off];
			if (offset & kUserDictOffsetFlagRemove) {
				off++;
				continue;
			}
			uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
			if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
				break;
#else
			if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
				break;
#endif
			if (equal_spell_id(splids, lemma_len, &searchable) == true) {
				// Spellings match; also require the hanzi to be identical.
				uint16 * str = get_lemma_word(offset);
				uint32 i = 0;
				for (i = 0; i < lemma_len; i++) {
					if (str[i] == lemma_str[i])
						continue;
					break;
				}
				if (i < lemma_len) {
					off++;
					continue;
				}
#ifdef ___CACHE_ENABLED___
				// No need to save_cache here, since current function is invoked by
				// put_lemma. It's rarely possible for a user input same lemma twice.
				// That means first time user type a new lemma, it is newly added into
				// user dictionary, then it's possible that user type the same lemma
				// again.
				// Another reason save_cache can not be invoked here is this function
				// aborts when lemma is found, and it never knows the count.
#endif
				return off;
			}
			off++;
		}

		return -1;
	}

#ifdef ___PREDICT_ENABLED___
	// Binary-searches the word-sorted predict list for the position where a
	// lemma with the given hanzi `words` should be inserted, returning the
	// index of the last entry ordered at-or-before it.
	// Fix: the search must index predicts_ (sorted by word) — it previously
	// read offsets_, which is sorted by spelling, so the word-keyed binary
	// search and its callers (which maintain predicts_) disagreed.
	uint32 UserDict::locate_where_to_insert_in_predicts(
		const uint16 * words, int lemma_len) {
		int32 begin = 0;
		int32 end = dict_info_.lemma_count - 1;
		int32 middle = end;

		uint32 last_matched = middle;

		while (begin <= end) {
			middle = (begin + end) >> 1;
			uint32 offset = predicts_[middle];
			uint8 nchar = get_lemma_nchar(offset);
			const uint16 * ws = get_lemma_word(offset);

			// Lexicographic compare of the stored word vs the new word,
			// shorter-prefix-first on ties.
			uint32 minl = nchar < lemma_len ? nchar : lemma_len;
			uint32 k = 0;
			int cmp = 0;

			for (; k < minl; k++) {
				if (ws[k] < words[k]) {
					cmp = -1;
					break;
				}
				else if (ws[k] > words[k]) {
					cmp = 1;
					break;
				}
			}
			if (cmp == 0) {
				if (nchar < lemma_len)
					cmp = -1;
				else if (nchar > lemma_len)
					cmp = 1;
			}

			if (cmp < 0) {
				begin = middle + 1;
				last_matched = middle;
			}
			else if (cmp > 0) {
				end = middle - 1;
			}
			else {
				end = middle - 1;
				last_matched = middle;
			}
		}

		return last_matched;
	}

	// Binary-searches the word-sorted predict list for the first entry
	// whose hanzi string starts with `words`, or -1 when none does. The
	// returned index is consumed by predict(), which iterates predicts_.
	// Fix: the search must index predicts_ (sorted by word) — it previously
	// read offsets_ (sorted by spelling), so the index it returned did not
	// correspond to the predicts_ ordering that predict() walks.
	int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
		int32 begin = 0;
		int32 end = dict_info_.lemma_count - 1;
		int32 middle = -1;

		int32 last_matched = middle;

		while (begin <= end) {
			middle = (begin + end) >> 1;
			uint32 offset = predicts_[middle];
			uint8 nchar = get_lemma_nchar(offset);
			const uint16 * ws = get_lemma_word(offset);

			uint32 minl = nchar < lemma_len ? nchar : lemma_len;
			uint32 k = 0;
			int cmp = 0;

			for (; k < minl; k++) {
				if (ws[k] < words[k]) {
					cmp = -1;
					break;
				}
				else if (ws[k] > words[k]) {
					cmp = 1;
					break;
				}
			}
			if (cmp == 0) {
				// A stored word at least as long as the key is a prefix
				// match; keep scanning left for the FIRST such entry.
				if (nchar >= lemma_len)
					last_matched = middle;
				if (nchar < lemma_len)
					cmp = -1;
				else if (nchar > lemma_len)
					cmp = 1;
			}

			if (cmp < 0) {
				begin = middle + 1;
			}
			else if (cmp > 0) {
				end = middle - 1;
			}
			else {
				end = middle - 1;
			}
		}

		return last_matched;
	}

#endif

	// Returns the id of the exact lemma (hanzi + spellings), or 0 when it
	// is not present in the dictionary.
	LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len) {
		const int32 idx = locate_in_offsets(lemma_str, splids, lemma_len);
		return (idx == -1) ? 0 : ids_[idx];
	}

	// Returns the ranking score for `lemma_id`, or 0 when the dictionary
	// is not loaded or the id is out of range.
	LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
		if (!is_valid_state() || !is_valid_lemma_id(lemma_id))
			return 0;

		return translate_score(_get_lemma_score(lemma_id));
	}

	// Returns the ranking score for the lemma identified by its hanzi and
	// spellings, or 0 when the dictionary is not loaded.
	LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len) {
		if (!is_valid_state())
			return 0;
		return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
	}

	// Raw stored score for `lemma_id`: resolves the lemma's word and
	// spellings via offsets_by_id_, then relocates it in offsets_ (which
	// also skips entries flagged removed). Returns 0 when not found.
	int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
		if (is_valid_state() == false)
			return 0;
		if (is_valid_lemma_id(lemma_id) == false)
			return 0;

		uint32 offset = offsets_by_id_[lemma_id - start_id_];

		uint32 nchar = get_lemma_nchar(offset);
		uint16 * spl = get_lemma_spell_ids(offset);
		uint16 * wrd = get_lemma_word(offset);

		int32 off = locate_in_offsets(wrd, spl, nchar);
		if (off == -1) {
			return 0;
		}

		return scores_[off];
	}

	// Raw stored score for the lemma identified by hanzi + spellings;
	// 0 when the dictionary is not loaded or the lemma is absent.
	int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len) {
		if (!is_valid_state())
			return 0;

		const int32 idx = locate_in_offsets(lemma_str, splids, lemma_len);
		return (idx == -1) ? 0 : scores_[idx];
	}

#ifdef ___SYNC_ENABLED___
	// Drops `offset` from the pending-sync list by overwriting the matching
	// entry with the last one (the sync list is unordered).
	void UserDict::remove_lemma_from_sync_list(uint32 offset) {
		offset &= kUserDictOffsetMask;
		uint32 i = 0;
		for (; i < dict_info_.sync_count; i++) {
			unsigned int off = (syncs_[i] & kUserDictOffsetMask);
			if (off == offset)
				break;
		}
		if (i < dict_info_.sync_count) {
			syncs_[i] = syncs_[dict_info_.sync_count - 1];
			dict_info_.sync_count--;
		}
	}
#endif

#ifdef ___PREDICT_ENABLED___
	// Marks the predict-list entry pointing at `offset` as removed (flag
	// bit set; the slot itself is kept so ordering is preserved).
	void UserDict::remove_lemma_from_predict_list(uint32 offset) {
		offset &= kUserDictOffsetMask;
		uint32 i = 0;
		for (; i < dict_info_.lemma_count; i++) {
			unsigned int off = (predicts_[i] & kUserDictOffsetMask);
			if (off == offset) {
				predicts_[i] |= kUserDictOffsetFlagRemove;
				break;
			}
		}
	}
#endif

	// Flags the lemma at offsets_[offset_index] as removed, detaches it
	// from the sync/predict lists, and updates the free-space accounting.
	// Returns false for an invalid state or index.
	bool UserDict::remove_lemma_by_offset_index(int offset_index) {
		if (is_valid_state() == false)
			return false;  // was `return 0` — bool literal for clarity

		int32 off = offset_index;
		if (off == -1) {
			return false;
		}

		uint32 offset = offsets_[off];
		uint32 nchar = get_lemma_nchar(offset);

		// Lazy deletion: set the remove flag; space is reclaimed later by
		// defragmentation.
		offsets_[off] |= kUserDictOffsetFlagRemove;

#ifdef ___SYNC_ENABLED___
		// Remove corresponding sync item
		remove_lemma_from_sync_list(offset);
#endif

#ifdef ___PREDICT_ENABLED___
		remove_lemma_from_predict_list(offset);
#endif
		dict_info_.free_count++;
		// Header (2 bytes) + spelling ids + word (2 bytes per char each).
		dict_info_.free_size += (2 + (nchar << 2));

		if (state_ < USER_DICT_OFFSET_DIRTY)
			state_ = USER_DICT_OFFSET_DIRTY;
		return true;
	}

	// Removes the lemma identified by `lemma_id`: resolves its word and
	// spellings, finds its offsets_ slot, and flags it removed. Returns
	// false when the state or id is invalid or the lemma cannot be located.
	bool UserDict::remove_lemma(LemmaIdType lemma_id) {
		if (is_valid_state() == false)
			return false;  // was `return 0` — bool literal for clarity
		if (is_valid_lemma_id(lemma_id) == false)
			return false;
		uint32 offset = offsets_by_id_[lemma_id - start_id_];

		uint32 nchar = get_lemma_nchar(offset);
		uint16 * spl = get_lemma_spell_ids(offset);
		uint16 * wrd = get_lemma_word(offset);

		int32 off = locate_in_offsets(wrd, spl, nchar);

		return remove_lemma_by_offset_index(off);
	}

	// Discards the in-memory dictionary and reloads it from disk. Used when
	// another instance has flushed newer data. The path is duplicated first
	// because close_dict() frees dict_file_.
	void UserDict::flush_cache() {
		LemmaIdType start_id = start_id_;
		if (!dict_file_)
			return;
		const char * file = strdup(dict_file_);
		if (!file)
			return;
		close_dict();
		load_dict(file, start_id, kUserDictIdEnd);
		free((void*)file);
#ifdef ___CACHE_ENABLED___
		cache_init();
#endif
		return;
	}

	// Creates (or truncates) `file` as an empty user dictionary: the
	// 4-byte version word followed by a zeroed UserDictInfo footer. The
	// partially written file is unlinked on a short write.
	bool UserDict::reset(const char *file) {
		FILE *fp = fopen(file, "w+");
		if (!fp) {
			return false;
		}
		uint32 version = kUserDictVersion;
		size_t wred = fwrite(&version, 1, 4, fp);
		UserDictInfo info;
		memset(&info, 0, sizeof(info));
		// By default, no limitation for lemma count and size
		// thereby, reclaim_ratio is never used
		wred += fwrite(&info, 1, sizeof(info), fp);
		if (wred != sizeof(info) + sizeof(version)) {
			fclose(fp);
			unlink(file);
			return false;
		}
		fclose(fp);
		return true;
	}

	// Sanity-checks the dictionary file: version word must match, the
	// trailing UserDictInfo footer must be readable, and the actual file
	// size must equal the size implied by the footer.
	bool UserDict::validate(const char *file) {
		// b is ignored in POSIX compatible os including Linux
		// while b is important flag for Windows to specify binary mode
		FILE *fp = fopen(file, "rb");
		if (!fp) {
			return false;
		}

		size_t size;
		size_t readed;
		uint32 version;
		UserDictInfo dict_info;

		// validate
		int err = fseek(fp, 0, SEEK_END);
		if (err) {
			goto error;
		}

		size = ftell(fp);
		if (size < 4 + sizeof(dict_info)) {
			goto error;
		}

		err = fseek(fp, 0, SEEK_SET);
		if (err) {
			goto error;
		}

		readed = fread(&version, 1, sizeof(version), fp);
		if (readed < sizeof(version)) {
			goto error;
		}
		if (version != kUserDictVersion) {
			goto error;
		}

		// Seek back over the footer. Negate a signed value: the previous
		// `-1 * sizeof(dict_info)` multiplied in size_t, producing a huge
		// unsigned value whose narrowing to fseek's long offset is
		// implementation-defined.
		err = fseek(fp, -(long)sizeof(dict_info), SEEK_END);
		if (err) {
			goto error;
		}

		readed = fread(&dict_info, 1, sizeof(dict_info), fp);
		if (readed != sizeof(dict_info)) {
			goto error;
		}

		if (size != get_dict_file_size(&dict_info)) {
			goto error;
		}

		fclose(fp);
		return true;

	error:
		fclose(fp);
		return false;
	}

	// Loads the whole dictionary file into freshly allocated buffers
	// (lemma pool, offsets, scores, ids, optional predict/sync arrays),
	// each over-allocated by kUserDictPreAlloc entries so new lemmas can be
	// added without reallocating. Installs the buffers into the members
	// only after everything has been read; on any failure all temporary
	// buffers are freed and members are left untouched. Serialized by
	// g_mutex_ (non-blocking: fails if the lock is busy).
	bool UserDict::load(const char *file, LemmaIdType start_id) {
		if (0 != pthread_mutex_trylock(&g_mutex_)) {
			return false;
		}
		// b is ignored in POSIX compatible os including Linux
		// while b is important flag for Windows to specify binary mode
		FILE *fp = fopen(file, "rb");
		if (!fp) {
			pthread_mutex_unlock(&g_mutex_);
			return false;
		}

		size_t readed, toread;
		UserDictInfo dict_info;
		uint8 *lemmas = NULL;
		uint32 *offsets = NULL;
#ifdef ___SYNC_ENABLED___
		uint32 *syncs = NULL;
#endif
		uint32 *scores = NULL;
		uint32 *ids = NULL;
		uint32 *offsets_by_id = NULL;
#ifdef ___PREDICT_ENABLED___
		uint32 *predicts = NULL;
#endif
		size_t i;
		int err;

		// Read the footer first to learn the array sizes.
		// NOTE(review): `-1 * sizeof(...)` is size_t arithmetic narrowed to
		// fseek's long offset — works on common ABIs but is formally
		// implementation-defined; -(long)sizeof(dict_info) would be exact.
		err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
		if (err) goto error;

		readed = fread(&dict_info, 1, sizeof(dict_info), fp);
		if (readed != sizeof(dict_info)) goto error;

		// Lemma pool, with pre-allocated headroom for new entries.
		lemmas = (uint8 *)malloc(
			dict_info.lemma_size +
			(kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));

		if (!lemmas) goto error;

		offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
		if (!offsets) goto error;

#ifdef ___PREDICT_ENABLED___
		predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
		if (!predicts) goto error;
#endif

#ifdef ___SYNC_ENABLED___
		syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
		if (!syncs) goto error;
#endif

		scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
		if (!scores) goto error;

		ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
		if (!ids) goto error;

		offsets_by_id = (uint32 *)malloc(
			(dict_info.lemma_count + kUserDictPreAlloc) << 2);
		if (!offsets_by_id) goto error;

		// Skip the 4-byte version word; then read each section with a
		// short-read-tolerant loop.
		err = fseek(fp, 4, SEEK_SET);
		if (err) goto error;

		readed = 0;
		while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
			readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
		}
		if (readed < dict_info.lemma_size)
			goto error;

		toread = (dict_info.lemma_count << 2);
		readed = 0;
		while (readed < toread && !ferror(fp) && !feof(fp)) {
			readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
		}
		if (readed < toread)
			goto error;

#ifdef ___PREDICT_ENABLED___
		toread = (dict_info.lemma_count << 2);
		readed = 0;
		while (readed < toread && !ferror(fp) && !feof(fp)) {
			readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
		}
		if (readed < toread)
			goto error;
#endif

		readed = 0;
		while (readed < toread && !ferror(fp) && !feof(fp)) {
			readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
		}
		if (readed < toread)
			goto error;

#ifdef ___SYNC_ENABLED___
		toread = (dict_info.sync_count << 2);
		readed = 0;
		while (readed < toread && !ferror(fp) && !feof(fp)) {
			readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
		}
		if (readed < toread)
			goto error;
#endif

		// Ids are assigned sequentially from start_id; the id->offset map
		// starts out identical to the offsets array.
		for (i = 0; i < dict_info.lemma_count; i++) {
			ids[i] = start_id + i;
			offsets_by_id[i] = offsets[i];
		}

		// All reads succeeded — publish the buffers.
		lemmas_ = lemmas;
		offsets_ = offsets;
#ifdef ___SYNC_ENABLED___
		syncs_ = syncs;
		sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
#endif
		offsets_by_id_ = offsets_by_id;
		scores_ = scores;
		ids_ = ids;
#ifdef ___PREDICT_ENABLED___
		predicts_ = predicts;
#endif
		lemma_count_left_ = kUserDictPreAlloc;
		lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
		memcpy(&dict_info_, &dict_info, sizeof(dict_info));
		state_ = USER_DICT_SYNC;

		fclose(fp);

		pthread_mutex_unlock(&g_mutex_);
		return true;

	error:
		if (lemmas) free(lemmas);
		if (offsets) free(offsets);
#ifdef ___SYNC_ENABLED___
		if (syncs) free(syncs);
#endif
		if (scores) free(scores);
		if (ids) free(ids);
		if (offsets_by_id) free(offsets_by_id);
#ifdef ___PREDICT_ENABLED___
		if (predicts) free(predicts);
#endif
		fclose(fp);
		pthread_mutex_unlock(&g_mutex_);
		return false;
	}

	// Flushes dirty state to the dictionary file. The dirty states form an
	// ordered hierarchy, so the single switch dispatches to the narrowest
	// rewrite that covers everything that changed.
	void UserDict::write_back() {
		// XXX write back is only allowed from close_dict due to thread-safe sake
		if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
			return;
		int fd = open(dict_file_, O_WRONLY);
		if (fd == -1)
			return;
		switch (state_) {
		case USER_DICT_DEFRAGMENTED:
			write_back_all(fd);
			break;
		case USER_DICT_LEMMA_DIRTY:
			write_back_lemma(fd);
			break;
		case USER_DICT_OFFSET_DIRTY:
			write_back_offset(fd);
			break;
		case USER_DICT_SCORE_DIRTY:
			write_back_score(fd);
			break;
#ifdef ___SYNC_ENABLED___
		case USER_DICT_SYNC_DIRTY:
			write_back_sync(fd);
			break;
#endif
		default:
			break;
		}
		// It seems truncate is not need on Linux, Windows except Mac
		// I am doing it here anyway for safety.
		// NOTE(review): the truncate is compiled out on _WIN32 (no POSIX
		// ftruncate); if the file can ever shrink there, _chsize would be
		// needed — confirm.
		off_t cur = lseek(fd, 0, SEEK_CUR);
#ifndef _WIN32
		ftruncate(fd, cur);
#endif
		close(fd);
		state_ = USER_DICT_SYNC;
	}

#ifdef ___SYNC_ENABLED___
	// Write only the sync queue and the trailing dict-info copy.
	// File layout (see write_back_all):
	// [4-byte header][lemmas][offsets][predicts?][scores][syncs][info].
	void UserDict::write_back_sync(int fd) {
		off_t pos = 4 + dict_info_.lemma_size
			+ (dict_info_.lemma_count << 3);   // offsets + scores
#ifdef ___PREDICT_ENABLED___
		pos += (dict_info_.lemma_count << 2);  // predicts table
#endif
		int err = lseek(fd, pos, SEEK_SET);
		if (err == -1)
			return;
		write(fd, syncs_, dict_info_.sync_count << 2);
		write(fd, &dict_info_, sizeof(dict_info_));
	}
#endif

	void UserDict::write_back_offset(int fd) {
		int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
		if (err == -1)
			return;
		write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
		write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
		write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
		write(fd, syncs_, dict_info_.sync_count << 2);
#endif
		write(fd, &dict_info_, sizeof(dict_info_));
	}

	// Seek past lemmas and the offset table (and predicts when enabled),
	// then rewrite scores, the sync queue and the dict-info footer.
	void UserDict::write_back_score(int fd) {
		off_t pos = 4 + dict_info_.lemma_size
			+ (dict_info_.lemma_count << 2);   // offsets table
#ifdef ___PREDICT_ENABLED___
		pos += (dict_info_.lemma_count << 2);  // predicts table
#endif
		int err = lseek(fd, pos, SEEK_SET);
		if (err == -1)
			return;
		write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
		write(fd, syncs_, dict_info_.sync_count << 2);
#endif
		write(fd, &dict_info_, sizeof(dict_info_));
	}

	// Lemmas are append-only between flushes, so only the tail of the lemma
	// block added since load needs to be written; everything after the lemma
	// block is rewritten wholesale.
	void UserDict::write_back_lemma(int fd) {
		size_t need_write = kUserDictPreAlloc *
			(2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
		// Equivalent to the old seek-to-4 + relative seek: land right where
		// the newly appended lemma bytes begin.
		int err = lseek(fd, 4 + dict_info_.lemma_size - need_write, SEEK_SET);
		if (err == -1)
			return;
		write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);

		write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
		write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
		write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
		write(fd, syncs_, dict_info_.sync_count << 2);
#endif
		write(fd, &dict_info_, sizeof(dict_info_));
	}

	void UserDict::write_back_all(int fd) {
		// XXX lemma_size is handled differently in writeall
		// and writelemma. I update lemma_size and lemma_count in different
		// places for these two cases. Should fix it to make it consistent.
		int err = lseek(fd, 4, SEEK_SET);
		if (err == -1)
			return;
		write(fd, lemmas_, dict_info_.lemma_size);
		write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
		write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
		write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
		write(fd, syncs_, dict_info_.sync_count << 2);
#endif
		write(fd, &dict_info_, sizeof(dict_info_));
	}

#ifdef ___CACHE_ENABLED___
	// Look up a previous search result for this searchable in the ring
	// buffer keyed by spelling-id length. On a hit, returns true and fills
	// *offset / *length with the cached range; otherwise returns false.
	bool UserDict::load_cache(UserDictSearchable *searchable,
		uint32 *offset, uint32 *length) {
		UserDictCache *cache = &caches_[searchable->splids_len - 1];
		if (cache->head == cache->tail)
			return false;  // ring empty

		const uint16 sig_len = kMaxLemmaSize / 4;
		for (uint16 i = cache->head; ; ) {
			bool match = true;
			for (uint16 j = 0; j < sig_len; j++) {
				if (cache->signatures[i][j] != searchable->signature[j]) {
					match = false;
					break;
				}
			}
			if (match) {
				*offset = cache->offsets[i];
				*length = cache->lengths[i];
				return true;
			}
			// Advance around the ring until we come back to tail.
			if (++i >= kUserDictCacheSize)
				i -= kUserDictCacheSize;
			if (i == cache->tail)
				break;
		}
		return false;
	}

	// Record a search result in the per-length ring buffer, evicting the
	// oldest entry (advancing head) when the ring is full.
	void UserDict::save_cache(UserDictSearchable *searchable,
		uint32 offset, uint32 length) {
		UserDictCache *cache = &caches_[searchable->splids_len - 1];
		uint16 slot = cache->tail;

		cache->offsets[slot] = offset;
		cache->lengths[slot] = length;
		const uint16 sig_len = kMaxLemmaSize / 4;
		for (uint16 j = 0; j < sig_len; j++)
			cache->signatures[slot][j] = searchable->signature[j];

		if (++slot >= kUserDictCacheSize)
			slot -= kUserDictCacheSize;
		if (slot == cache->head) {
			// Ring full: drop the oldest entry.
			if (++cache->head >= kUserDictCacheSize)
				cache->head -= kUserDictCacheSize;
		}
		cache->tail = slot;
	}

	// Drop all cached hit ranges; zeroing head == tail marks each ring empty.
	void UserDict::reset_cache() {
		memset(caches_, 0, sizeof(caches_));
	}

	// Return true if this searchable's signature is recorded as a known
	// miss (a query that previously found nothing) for its splid length.
	bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
		UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
		if (cache->head == cache->tail)
			return false;  // ring empty

		const uint16 sig_len = kMaxLemmaSize / 4;
		for (uint16 i = cache->head; ; ) {
			bool match = true;
			for (uint16 j = 0; j < sig_len; j++) {
				if (cache->signatures[i][j] != searchable->signature[j]) {
					match = false;
					break;
				}
			}
			if (match)
				return true;
			// Advance around the ring until we come back to tail.
			if (++i >= kUserDictMissCacheSize)
				i -= kUserDictMissCacheSize;
			if (i == cache->tail)
				break;
		}
		return false;
	}

	// Record this searchable's signature as a known miss, evicting the
	// oldest entry (advancing head) when the ring buffer is full.
	void UserDict::save_miss_cache(UserDictSearchable *searchable) {
		UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
		uint16 slot = cache->tail;

		const uint16 sig_len = kMaxLemmaSize / 4;
		for (uint16 j = 0; j < sig_len; j++)
			cache->signatures[slot][j] = searchable->signature[j];

		if (++slot >= kUserDictMissCacheSize)
			slot -= kUserDictMissCacheSize;
		if (slot == cache->head) {
			// Ring full: drop the oldest entry.
			if (++cache->head >= kUserDictMissCacheSize)
				cache->head -= kUserDictMissCacheSize;
		}
		cache->tail = slot;
	}

	// Drop all recorded misses; zeroing head == tail marks each ring empty.
	void UserDict::reset_miss_cache() {
		memset(miss_caches_, 0, sizeof(miss_caches_));
	}

	// Clear both the hit cache and the miss cache.
	void UserDict::cache_init() {
		reset_cache();
		reset_miss_cache();
	}

	// Consult both caches for this searchable. A miss-cache hit means the
	// query is already known to find nothing: report true with an empty
	// (0, 0) range so the caller can skip the real search.
	bool UserDict::cache_hit(UserDictSearchable *searchable,
		uint32 *offset, uint32 *length) {
		if (load_miss_cache(searchable)) {
			*offset = 0;
			*length = 0;
			return true;
		}
		// Fall back to the positive-result cache.
		return load_cache(searchable, offset, length);
	}

	// Store a search outcome in the appropriate cache. Unknown cache types
	// are silently ignored.
	void UserDict::cache_push(UserDictCacheType type,
		UserDictSearchable *searchable,
		uint32 offset, uint32 length) {
		if (type == USER_DICT_MISS_CACHE) {
			save_miss_cache(searchable);
		}
		else if (type == USER_DICT_CACHE) {
			save_cache(searchable, offset, length);
		}
	}

#endif

	// Compact the user dictionary in memory: drop lemmas whose offsets carry
	// kUserDictOffsetFlagRemove, squeeze the surviving lemma bytes together,
	// and fix up every table that stores lemma offsets.
	//
	// Fix: the scan loops used to evaluate offsets_[i] / predicts_[i]
	// BEFORE checking i < dict_info_.lemma_count, reading one element past
	// the end of the table on the final iteration. The bound check now
	// comes first in each loop condition.
	void UserDict::defragment(void) {
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_BEGIN;
#endif
		if (is_valid_state() == false)
			return;
		// Pass 1: partition offsets_ so live entries precede removed ones,
		// mirroring each move in scores_ and ids_. Removed entries get
		// kUserDictLemmaFlagRemove stamped into the lemma bytes so pass 3
		// can identify dead byte ranges.
		size_t first_freed = 0;
		size_t first_inuse = 0;
		while (first_freed < dict_info_.lemma_count) {
			// Find first freed offset (bound check first -- see fix above)
			while (first_freed < dict_info_.lemma_count &&
				(offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0) {
				first_freed++;
			}
			if (first_freed < dict_info_.lemma_count) {
				// Save REMOVE flag to lemma flag
				// NOTE(review): off still carries the offset flag bits;
				// presumably set_lemma_flag masks them -- confirm.
				int off = offsets_[first_freed];
				set_lemma_flag(off, kUserDictLemmaFlagRemove);
			}
			else {
				break;
			}
			// Find first inuse offset after first_freed
			first_inuse = first_freed + 1;
			while (first_inuse < dict_info_.lemma_count &&
				(offsets_[first_inuse] & kUserDictOffsetFlagRemove)) {
				// Save REMOVE flag to lemma flag
				int off = offsets_[first_inuse];
				set_lemma_flag(off, kUserDictLemmaFlagRemove);
				first_inuse++;
			}
			if (first_inuse >= dict_info_.lemma_count) {
				break;
			}
			// Swap offsets_
			int tmp = offsets_[first_inuse];
			offsets_[first_inuse] = offsets_[first_freed];
			offsets_[first_freed] = tmp;
			// Move scores_, no need to swap
			tmp = scores_[first_inuse];
			scores_[first_inuse] = scores_[first_freed];
			scores_[first_freed] = tmp;
			// Swap ids_
			LemmaIdType tmpid = ids_[first_inuse];
			ids_[first_inuse] = ids_[first_freed];
			ids_[first_freed] = tmpid;
			// Go on
			first_freed++;
		}
#ifdef ___PREDICT_ENABLED___
		// Pass 2: apply the same partitioning to predicts_.
		first_freed = 0;
		first_inuse = 0;
		while (first_freed < dict_info_.lemma_count) {
			// Find first freed offset (bound check first -- see fix above)
			while (first_freed < dict_info_.lemma_count &&
				(predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0) {
				first_freed++;
			}
			if (first_freed >= dict_info_.lemma_count)
				break;
			// Find first inuse offset after first_freed
			first_inuse = first_freed + 1;
			while (first_inuse < dict_info_.lemma_count &&
				(predicts_[first_inuse] & kUserDictOffsetFlagRemove)) {
				first_inuse++;
			}
			if (first_inuse >= dict_info_.lemma_count) {
				break;
			}
			// Swap predicts_
			int tmp = predicts_[first_inuse];
			predicts_[first_inuse] = predicts_[first_freed];
			predicts_[first_freed] = tmp;
			// Go on
			first_freed++;
		}
#endif
		// first_freed is now the number of surviving lemmas.
		dict_info_.lemma_count = first_freed;
		// Pass 3: compact lemmas_ by sliding live byte ranges left over the
		// removed ones, updating offsets_ / offsets_by_id_ (and predicts_ /
		// syncs_ where enabled) as each range moves.
		size_t begin = 0;
		size_t end = 0;
		size_t dst = 0;
		int total_size = dict_info_.lemma_size + lemma_size_left_;
		int total_count = dict_info_.lemma_count + lemma_count_left_;
		size_t real_size = total_size - lemma_size_left_;
		while (dst < real_size) {
			unsigned char flag = get_lemma_flag(dst);
			unsigned char nchr = get_lemma_nchar(dst);
			if ((flag & kUserDictLemmaFlagRemove) == 0) {
				dst += nchr * 4 + 2;
				continue;
			}
			break;
		}
		if (dst >= real_size)
			return;  // no removed lemma bytes found: block already compact

		end = dst;
		while (end < real_size) {
			begin = end + get_lemma_nchar(end) * 4 + 2;
		repeat:
			// not used any more
			if (begin >= real_size)
				break;
			unsigned char flag = get_lemma_flag(begin);
			unsigned char nchr = get_lemma_nchar(begin);
			if (flag & kUserDictLemmaFlagRemove) {
				begin += nchr * 4 + 2;
				goto repeat;
			}
			// [begin, end) becomes the next run of live lemma bytes.
			end = begin + nchr * 4 + 2;
			while (end < real_size) {
				unsigned char eflag = get_lemma_flag(end);
				unsigned char enchr = get_lemma_nchar(end);
				if ((eflag & kUserDictLemmaFlagRemove) == 0) {
					end += enchr * 4 + 2;
					continue;
				}
				break;
			}
			memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
			// Relocate every stored offset that pointed into [begin, end).
			for (size_t j = 0; j < dict_info_.lemma_count; j++) {
				if (offsets_[j] >= begin && offsets_[j] < end) {
					offsets_[j] -= (begin - dst);
					offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
				}
#ifdef ___PREDICT_ENABLED___
				if (predicts_[j] >= begin && predicts_[j] < end) {
					predicts_[j] -= (begin - dst);
				}
#endif
			}
#ifdef ___SYNC_ENABLED___
			for (size_t j = 0; j < dict_info_.sync_count; j++) {
				if (syncs_[j] >= begin && syncs_[j] < end) {
					syncs_[j] -= (begin - dst);
				}
			}
#endif
			dst += (end - begin);
		}

		dict_info_.free_count = 0;
		dict_info_.free_size = 0;
		dict_info_.lemma_size = dst;
		lemma_size_left_ = total_size - dict_info_.lemma_size;
		lemma_count_left_ = total_count - dict_info_.lemma_count;

		// XXX Without following code,
		// offsets_by_id_ is not reordered.
		// That's to say, all removed lemmas' ids are not collected back.
		// There may not be room for addition of new lemmas due to
		// offsests_by_id_ reason, although lemma_size_left_ is fixed.
		// By default, we do want defrag as fast as possible, because
		// during defrag procedure, other peers can not write new lemmas
		// to user dictionary file.
		// XXX If write-back is invoked immediately after
		// this defragment, no need to fix up following in-mem data.
		for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
			ids_[i] = start_id_ + i;
			offsets_by_id_[i] = offsets_[i];
		}

		state_ = USER_DICT_DEFRAGMENTED;

#ifdef ___DEBUG_PERF___
		DEBUG_PERF_END;
		LOGD_PERF("defragment");
#endif
	}

#ifdef ___SYNC_ENABLED___
	// Remove entries [start, end) from the sync queue, sliding the tail of
	// the queue down over them. end is clamped to the current sync count.
	//
	// Fix: when start > end (after clamping), the old code computed
	// (sync_count - end) and (end - start) with unsigned wrap-around,
	// handing memmove a huge size and corrupting sync_count. Such a range
	// is now rejected; start == end keeps its original (no-op move,
	// still marks dirty) behavior.
	void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
		if (is_valid_state() == false)
			return;
		if (end > dict_info_.sync_count)
			end = dict_info_.sync_count;
		if (start > end)
			return;
		memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
		dict_info_.sync_count -= (end - start);
		if (state_ < USER_DICT_SYNC_DIRTY)
			state_ = USER_DICT_SYNC_DIRTY;
	}

	// Number of lemmas currently queued for sync; 0 when the dictionary
	// is not in a usable state.
	int UserDict::get_sync_count() {
		return is_valid_state() ? dict_info_.sync_count : 0;
	}

	// Add a lemma that originated from the sync peer. syncs_ is temporarily
	// NULLed so _put_lemma() does not queue the lemma for upload again.
	// If the first attempt fails because a capacity limit was hit, the
	// dictionary is reclaimed/defragmented/flushed once and the insert is
	// retried a single time.
	LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len, uint16 count, uint64 lmt) {
		for (int attempt = 0; ; attempt++) {
			uint32 * saved_syncs = syncs_;
			syncs_ = NULL;
			LemmaIdType id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
			syncs_ = saved_syncs;
			if (id != 0 || attempt > 0)
				return id;
			bool over_count = dict_info_.limit_lemma_count > 0 &&
				dict_info_.lemma_count >= dict_info_.limit_lemma_count;
			bool over_size = dict_info_.limit_lemma_size > 0 &&
				dict_info_.lemma_size + (2 + (lemma_len << 2))
				> dict_info_.limit_lemma_size;
			if (!over_count && !over_size)
				return id;
			// XXX Always reclaim and defrag in the sync code path: the sync
			// thread runs in the background and is OK with heavy work.
			reclaim();
			defragment();
			flush_cache();
		}
	}

	// Parse a UTF-16LE buffer of "pinyin,phrase,freq,lmt;" records and add
	// each one as a lemma without queueing it for sync (these records came
	// FROM the sync peer). Returns the number of records processed.
	//
	// Fixes: (1) spl_parser was never freed, leaking one SpellingParser per
	// call; (2) every field-scanning loop dereferenced *p BEFORE checking
	// that p was still inside the buffer, reading one char16 past its end.
	int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
		int newly_added = 0;

		SpellingParser * spl_parser = new SpellingParser();
		if (!spl_parser) {
			return 0;
		}
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_BEGIN;
#endif
		char16 *ptr = lemmas;

		// Extract pinyin,words,frequence,last_mod_time
		char16 * p = ptr, *py16 = ptr;
		char16 * hz16 = NULL;
		int py16_len = 0;
		uint16 splid[kMaxLemmaSize];
		int splid_len = 0;
		int hz16_len = 0;
		char16 * fr16 = NULL;
		int fr16_len = 0;

		while (p - ptr < len) {
			// Pinyin field: spellings separated by ' ', terminated by ','
			py16 = p;
			splid_len = 0;
			while ((p - ptr) < len && *p != 0x2c) {
				if (*p == 0x20)
					splid_len++;
				p++;
			}
			splid_len++;
			if (p - ptr == len)
				break;  // buffer ended inside the pinyin field
			py16_len = p - py16;
			if (kMaxLemmaSize < splid_len) {
				break;
			}
			bool is_pre;
			int splidl = spl_parser->splstr16_to_idxs_f(
				py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
			if (splidl != splid_len)
				break;  // malformed pinyin: abort the whole batch
			// Phrase field, terminated by ','
			hz16 = ++p;
			while ((p - ptr) < len && *p != 0x2c) {
				p++;
			}
			hz16_len = p - hz16;
			if (hz16_len != splid_len)
				break;  // phrase length must match spelling count
			// Frequency field, terminated by ','
			fr16 = ++p;
			fr16_len = 0;
			while ((p - ptr) < len && *p != 0x2c) {
				p++;
			}
			fr16_len = p - fr16;
			uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
			// Last-modified-time field, terminated by ';'
			fr16 = ++p;
			fr16_len = 0;
			while ((p - ptr) < len && *p != 0x3b) {
				p++;
			}
			fr16_len = p - fr16;
			uint64 last_mod = utf16le_atoll(fr16, fr16_len);

			put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
			newly_added++;

			p++;  // skip the ';' record separator
		}

		delete spl_parser;  // was leaked before
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_END;
		LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
#endif
		return newly_added;
	}

	// Serialize queued-for-sync lemmas into a UTF-16LE string of
	// "pinyin ...,phrase,freq,lmt;" records. Writes at most `size` char16s
	// into `str`, sets *count to the number of records emitted, and returns
	// the number of char16s written. Marks the dict sync-dirty if anything
	// was emitted.
	//
	// Fix: the bounds checks used `temp + sizeof(temp)`, but `temp` is a
	// char16 array, so sizeof() counts BYTES (512) while the pointer
	// arithmetic advances in char16 units -- the old limit pointed 256
	// elements past the end of the buffer, allowing a stack buffer overflow
	// for long records. Capacity is now computed in elements.
	int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
		char16 * str, int size, int * count) {
		int len = 0;
		*count = 0;

		int left_len = size;

		if (is_valid_state() == false)
			return len;

		SpellingTrie * spl_trie = &SpellingTrie::get_instance();
		if (!spl_trie) {
			return 0;
		}

		uint32 i;
		for (i = 0; i < dict_info_.sync_count; i++) {
			int offset = syncs_[i];
			uint32 nchar = get_lemma_nchar(offset);
			uint16 *spl = get_lemma_spell_ids(offset);
			uint16 *wrd = get_lemma_word(offset);
			int score = _get_lemma_score(wrd, spl, nchar);

			// NOTE(review): static scratch buffers make this method
			// non-reentrant -- confirm callers never use it concurrently.
			static char score_temp[32], *pscore_temp = score_temp;
			static char16 temp[256], *ptemp = temp;
			// Capacity in char16 ELEMENTS, not bytes (see fix above).
			const int temp_size = (int)(sizeof(temp) / sizeof(temp[0]));

			pscore_temp = score_temp;
			ptemp = temp;

			uint32 j;
			// Add pinyin
			for (j = 0; j < nchar; j++) {
				int ret_len = spl_trie->get_spelling_str16(
					spl[j], ptemp, temp + temp_size - ptemp);
				if (ret_len <= 0)
					break;
				ptemp += ret_len;
				if (ptemp < temp + temp_size - 1) {
					*(ptemp++) = ' ';
				}
				else {
					j = 0;  // force the skip-this-record path below
					break;
				}
			}
			if (j < nchar) {
				continue;
			}
			ptemp--;  // drop the trailing space
			if (ptemp < temp + temp_size - 1) {
				*(ptemp++) = ',';
			}
			else {
				continue;
			}
			// Add phrase
			for (j = 0; j < nchar; j++) {
				if (ptemp < temp + temp_size - 1) {
					*(ptemp++) = wrd[j];
				}
				else {
					break;
				}
			}
			if (j < nchar) {
				continue;
			}
			if (ptemp < temp + temp_size - 1) {
				*(ptemp++) = ',';
			}
			else {
				continue;
			}
			// Add frequency
			uint32 intf = extract_score_freq(score);
			int ret_len = utf16le_lltoa(intf, ptemp, temp + temp_size - ptemp);
			if (ret_len <= 0)
				continue;
			ptemp += ret_len;
			if (ptemp < temp + temp_size - 1) {
				*(ptemp++) = ',';
			}
			else {
				continue;
			}
			// Add last modified time
			uint64 last_mod = extract_score_lmt(score);
			ret_len = utf16le_lltoa(last_mod, ptemp, temp + temp_size - ptemp);
			if (ret_len <= 0)
				continue;
			ptemp += ret_len;
			if (ptemp < temp + temp_size - 1) {
				*(ptemp++) = ';';
			}
			else {
				continue;
			}

			// Copy the finished record into the output, if it still fits.
			int need_len = ptemp - temp;
			if (need_len > left_len)
				break;
			memcpy(str + len, temp, need_len * 2);
			left_len -= need_len;

			len += need_len;
			(*count)++;
		}

		if (len > 0) {
			if (state_ < USER_DICT_SYNC_DIRTY)
				state_ = USER_DICT_SYNC_DIRTY;
		}
		return len;
	}

#endif

	// Fill *stat with a snapshot of the dictionary's statistics.
	// Returns false when the dict is not loaded or stat is NULL.
	bool UserDict::state(UserDictStat * stat) {
		if (!stat || is_valid_state() == false)
			return false;
		stat->version = version_;
		stat->file_name = dict_file_;
		stat->load_time.tv_sec = load_time_.tv_sec;
		stat->load_time.tv_usec = load_time_.tv_usec;
		// g_last_update_ is shared across instances; read it under the lock.
		pthread_mutex_lock(&g_mutex_);
		stat->last_update.tv_sec = g_last_update_.tv_sec;
		stat->last_update.tv_usec = g_last_update_.tv_usec;
		pthread_mutex_unlock(&g_mutex_);
		stat->disk_size = get_dict_file_size(&dict_info_);
		stat->lemma_count = dict_info_.lemma_count;
		stat->lemma_size = dict_info_.lemma_size;
		stat->delete_count = dict_info_.free_count;
		stat->delete_size = dict_info_.free_size;
#ifdef ___SYNC_ENABLED___
		stat->sync_count = dict_info_.sync_count;
#endif
		stat->limit_lemma_count = dict_info_.limit_lemma_count;
		stat->limit_lemma_size = dict_info_.limit_lemma_size;
		stat->reclaim_ratio = dict_info_.reclaim_ratio;
		return true;
	}

	// Record capacity limits. reclaim_ratio is a percentage of lemmas to
	// evict on reclaim(), capped at 100.
	void UserDict::set_limit(uint32 max_lemma_count,
		uint32 max_lemma_size, uint32 reclaim_ratio) {
		dict_info_.limit_lemma_count = max_lemma_count;
		dict_info_.limit_lemma_size = max_lemma_size;
		dict_info_.reclaim_ratio = (reclaim_ratio > 100) ? 100 : reclaim_ratio;
	}

	// Evict the lowest-scored lemmas, removing roughly reclaim_ratio percent
	// of the current lemma count. Uses a size-rc max-heap to select the rc
	// smallest scores in a single pass over scores_.
	void UserDict::reclaim() {
		if (is_valid_state() == false)
			return;

		switch (dict_info_.reclaim_ratio) {
		case 0:
			return;  // reclaiming disabled
		case 100:
			// TODO: CLEAR to be implemented
			assert(false);
			return;
		default:
			break;
		}

		// XXX Reclaim is only based on count, not size
		uint32 count = dict_info_.lemma_count;
		int rc = count * dict_info_.reclaim_ratio / 100;

		UserDictScoreOffsetPair * score_offset_pairs = NULL;
		score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
			sizeof(UserDictScoreOffsetPair) * rc);
		if (score_offset_pairs == NULL) {
			return;
		}

		// Seed the heap with the first rc (score, index) pairs.
		for (int i = 0; i < rc; i++) {
			int s = scores_[i];
			score_offset_pairs[i].score = s;
			score_offset_pairs[i].offset_index = i;
		}

		// Heapify. NOTE(review): starting at (rc + 1) / 2 is one node above
		// the usual rc / 2 - 1; the extra shift_down calls are no-ops.
		for (int i = (rc + 1) / 2; i >= 0; i--)
			shift_down(score_offset_pairs, i, rc);

		// Scan the remainder: whenever a smaller score appears, replace the
		// heap root so the heap always holds the rc smallest scores so far.
		for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
			int s = scores_[i];
			if (s < score_offset_pairs[0].score) {
				score_offset_pairs[0].score = s;
				score_offset_pairs[0].offset_index = i;
				shift_down(score_offset_pairs, 0, rc);
			}
		}

		// Remove the selected rc lowest-scored lemmas.
		for (int i = 0; i < rc; i++) {
			int off = score_offset_pairs[i].offset_index;
			remove_lemma_by_offset_index(off);
		}
		if (rc > 0) {
			if (state_ < USER_DICT_OFFSET_DIRTY)
				state_ = USER_DICT_OFFSET_DIRTY;
		}

		free(score_offset_pairs);
	}

	// Exchange the score/offset_index fields of heap entries i and j.
	inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
		const int tmp_score = sop[i].score;
		const int tmp_index = sop[i].offset_index;
		sop[i].score = sop[j].score;
		sop[i].offset_index = sop[j].offset_index;
		sop[j].score = tmp_score;
		sop[j].offset_index = tmp_index;
	}

	// Restore the max-heap property (ordered by score) for the subtree
	// rooted at i within the first n elements of sop.
	//
	// Fix: the old code compared the two children with strict '>' in both
	// directions, so when both children had EQUAL scores greater than the
	// parent, neither branch fired and sifting stopped early -- leaving a
	// broken heap and letting reclaim() evict the wrong lemmas. We now pick
	// the larger child (left wins ties) and sift while it beats the parent.
	void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
		int par = i;
		while (par < n) {
			int left = par * 2 + 1;
			int right = left + 1;
			if (left >= n)
				break;  // no children: heap property holds
			int child = left;
			if (right < n && sop[right].score > sop[left].score)
				child = right;
			if (sop[child].score <= sop[par].score)
				break;  // parent already >= both children
			swap(sop, child, par);
			par = child;
		}
	}

	// Public entry point: insert/update a lemma, stamping it with the
	// current wall-clock time as its last-modified time.
	LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len, uint16 count) {
		uint64 now = time(NULL);
		return _put_lemma(lemma_str, splids, lemma_len, count, now);
	}

	// Insert a lemma or, if it already exists, refresh its score/timestamp.
	// Returns the lemma id, or 0 on failure (invalid state, capacity limit
	// reached, or append failed). lmt is packed into the score by
	// build_score().
	LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len, uint16 count, uint64 lmt) {
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_BEGIN;
#endif
		if (is_valid_state() == false)
			return 0;
		int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
		if (off != -1) {
			// Lemma already present: update its score in place.
			// NOTE(review): delta is computed from the raw packed score
			// (which also carries the lmt bits) rather than
			// extract_score_freq(scores_[off]) -- looks suspicious; confirm
			// against build_score()'s bit layout before changing.
			int delta_score = count - scores_[off];
			dict_info_.total_nfreq += delta_score;
			scores_[off] = build_score(lmt, count);
			if (state_ < USER_DICT_SCORE_DIRTY)
				state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
			DEBUG_PERF_END;
			LOGD_PERF("_put_lemma(update)");
#endif
			return ids_[off];
		}
		else {
			// Refuse to grow past the configured count/size limits.
			if ((dict_info_.limit_lemma_count > 0 &&
				dict_info_.lemma_count >= dict_info_.limit_lemma_count)
				|| (dict_info_.limit_lemma_size > 0 &&
					dict_info_.lemma_size + (2 + (lemma_len << 2))
			> dict_info_.limit_lemma_size)) {
				// XXX Don't defragment here, it's too time-consuming.
				return 0;
			}
			int flushed = 0;
			if (lemma_count_left_ == 0 ||
				lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {

				// XXX When there is no space for new lemma, we flush to disk
				// flush_cache() may be called by upper user
				// and better place shoule be found instead of here
				flush_cache();
				flushed = 1;
				// Or simply return and do nothing
				// return 0;
			}
#ifdef ___DEBUG_PERF___
			DEBUG_PERF_END;
			LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
#endif
			LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
#ifdef ___SYNC_ENABLED___
			// syncs_ is temporarily NULLed by put_lemma_no_sync() to keep
			// lemmas that came FROM the sync peer out of the upload queue.
			if (syncs_ && id != 0) {
				queue_lemma_for_sync(id);
			}
#endif
			return id;
		}
		return 0;  // unreachable: both branches above return
	}

#ifdef ___SYNC_ENABLED___
	// Remember this lemma's offset so the next sync round uploads it.
	// Grows the queue by kUserDictPreAlloc entries when full; if the
	// realloc fails the lemma is silently dropped from the sync queue.
	void UserDict::queue_lemma_for_sync(LemmaIdType id) {
		if (dict_info_.sync_count >= sync_count_size_) {
			uint32 * grown = (uint32*)realloc(
				syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
			if (!grown)
				return;
			sync_count_size_ += kUserDictPreAlloc;
			syncs_ = grown;
		}
		syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
	}
#endif

	// Adjust an existing lemma's frequency by delta_count; when `selected`
	// is true, its last-modified time is refreshed to now. Returns the
	// lemma id on success, 0 on failure.
	LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
		bool selected) {
#ifdef ___DEBUG_PERF___
		DEBUG_PERF_BEGIN;
#endif
		if (is_valid_state() == false)
			return 0;
		if (is_valid_lemma_id(lemma_id) == false)
			return 0;
		uint32 offset = offsets_by_id_[lemma_id - start_id_];
		uint8 lemma_len = get_lemma_nchar(offset);
		char16 * lemma_str = get_lemma_word(offset);
		uint16 * splids = get_lemma_spell_ids(offset);

		int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
		if (off != -1) {
			int score = scores_[off];
			int count = extract_score_freq(score);
			uint64 lmt = extract_score_lmt(score);
			// Clamp so the frequency neither exceeds the maximum nor wraps.
			if (count + delta_count > kUserDictMaxFrequency ||
				count + delta_count < count) {
				delta_count = kUserDictMaxFrequency - count;
			}
			count += delta_count;
			dict_info_.total_nfreq += delta_count;
			if (selected) {
				lmt = time(NULL);
			}
			scores_[off] = build_score(lmt, count);
			if (state_ < USER_DICT_SCORE_DIRTY)
				state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
			DEBUG_PERF_END;
			LOGD_PERF("update_lemma");
#endif
#ifdef ___SYNC_ENABLED___
			queue_lemma_for_sync(ids_[off]);
#endif
			return ids_[off];
		}
		return 0;
	}

	// Sum of all lemma frequencies in this user dictionary.
	size_t UserDict::get_total_lemma_count() {
		return dict_info_.total_nfreq;
	}

	// Record the total lemma frequency of the other dictionaries;
	// presumably used to scale scores against the system dict -- confirm
	// in the scoring code.
	void UserDict::set_total_lemma_count_of_others(size_t count) {
		total_other_nfreq_ = count;
	}

	// Append a brand-new lemma to the in-memory arrays, then insert its
	// entry at the proper position in offsets_ (kept sorted by spelling)
	// and predicts_ (kept sorted by word). Caller has already verified
	// there is room (lemma_count_left_ / lemma_size_left_). Returns the new
	// id, or 0 if the lemma block outgrew the offset encoding.
	LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
		uint16 lemma_len, uint16 count, uint64 lmt) {
		LemmaIdType id = get_max_lemma_id() + 1;
		size_t offset = dict_info_.lemma_size;
		if (offset > kUserDictOffsetMask)
			return 0;

		// Lemma layout: [flag byte][nchar byte][nchar spell ids][nchar chars]
		lemmas_[offset] = 0;
		lemmas_[offset + 1] = (uint8)lemma_len;
		for (size_t i = 0; i < lemma_len; i++) {
			*((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
			*((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
				= lemma_str[i];
		}
		// Provisionally place the new entry at the end of each table.
		uint32 off = dict_info_.lemma_count;
		offsets_[off] = offset;
		scores_[off] = build_score(lmt, count);
		ids_[off] = id;
#ifdef ___PREDICT_ENABLED___
		predicts_[off] = offset;
#endif

		offsets_by_id_[id - start_id_] = offset;

		dict_info_.lemma_count++;
		dict_info_.lemma_size += (2 + (lemma_len << 2));
		lemma_count_left_--;
		lemma_size_left_ -= (2 + (lemma_len << 2));

		// Sort

		// Find the insertion point in offsets_ by fuzzy spell-id order.
		UserDictSearchable searchable;
		prepare_locate(&searchable, splids, lemma_len);

		size_t i = 0;
		while (i < off) {
			offset = offsets_[i];
			uint32 nchar = get_lemma_nchar(offset);
			uint16 * spl = get_lemma_spell_ids(offset);

			if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
				break;
			i++;
		}
		if (i != off) {
			// Shift [i, off) right by one in each parallel array and drop
			// the new entry into slot i.
			uint32 temp = offsets_[off];
			memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
			offsets_[i] = temp;

			temp = scores_[off];
			memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
			scores_[i] = temp;

			temp = ids_[off];
			memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
			ids_[i] = temp;
		}

#ifdef ___PREDICT_ENABLED___
		// Same insertion into predicts_, which is ordered by word instead.
		uint32 j = 0;
		uint16 * words_new = get_lemma_word(predicts_[off]);
		j = locate_where_to_insert_in_predicts(words_new, lemma_len);
		if (j != off) {
			uint32 temp = predicts_[off];
			memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
			predicts_[j] = temp;
		}
#endif

		if (state_ < USER_DICT_LEMMA_DIRTY)
			state_ = USER_DICT_LEMMA_DIRTY;

#ifdef ___CACHE_ENABLED___
		// Cached search ranges may now be stale; reset both caches.
		cache_init();
#endif

		dict_info_.total_nfreq += count;
		return id;
	}
}
